from __future__ import print_function
import nltk, re, pickle, os
import pandas as pd
import numpy as np
from textblob import TextBlob
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize, MWETokenizer
from nltk.stem import porter, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from collections import Counter
from operator import itemgetter
import heapq
import urllib
import urllib.parse
import spacy
nlp = spacy.load('en_core_web_sm')
from urllib.request import urlopen
from lxml import etree
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
import requests
import json
import lxml
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()
from plotly import tools
import plotly.tools as tls
init_notebook_mode(connected=True)
import ast
import plotly.express as px
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
# Load talk metadata and transcripts, then inner-join them on the talk URL
# so every row carries both the transcript and the descriptive fields.
ted_main = pd.read_csv('ted_main.csv')
ted_trans = pd.read_csv('transcripts.csv')
ted_all = ted_trans.merge(ted_main, on='url')
ted_all.head(5)
| transcript | url | comments | description | duration | event | film_date | languages | main_speaker | name | num_speaker | published_date | ratings | related_talks | speaker_occupation | tags | title | views | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Good morning. How are you?(Laughter)It's been ... | https://www.ted.com/talks/ken_robinson_says_sc... | 4553 | Sir Ken Robinson makes an entertaining and pro... | 1164 | TED2006 | 1140825600 | 60 | Ken Robinson | Ken Robinson: Do schools kill creativity? | 1 | 1151367060 | [{'id': 7, 'name': 'Funny', 'count': 19645}, {... | [{'id': 865, 'hero': 'https://pe.tedcdn.com/im... | Author/educator | ['children', 'creativity', 'culture', 'dance',... | Do schools kill creativity? | 47227110 |
| 1 | Thank you so much, Chris. And it's truly a gre... | https://www.ted.com/talks/al_gore_on_averting_... | 265 | With the same humor and humanity he exuded in ... | 977 | TED2006 | 1140825600 | 43 | Al Gore | Al Gore: Averting the climate crisis | 1 | 1151367060 | [{'id': 7, 'name': 'Funny', 'count': 544}, {'i... | [{'id': 243, 'hero': 'https://pe.tedcdn.com/im... | Climate advocate | ['alternative energy', 'cars', 'climate change... | Averting the climate crisis | 3200520 |
| 2 | (Music: "The Sound of Silence," Simon & Garfun... | https://www.ted.com/talks/david_pogue_says_sim... | 124 | New York Times columnist David Pogue takes aim... | 1286 | TED2006 | 1140739200 | 26 | David Pogue | David Pogue: Simplicity sells | 1 | 1151367060 | [{'id': 7, 'name': 'Funny', 'count': 964}, {'i... | [{'id': 1725, 'hero': 'https://pe.tedcdn.com/i... | Technology columnist | ['computers', 'entertainment', 'interface desi... | Simplicity sells | 1636292 |
| 3 | If you're here today — and I'm very happy that... | https://www.ted.com/talks/majora_carter_s_tale... | 200 | In an emotionally charged talk, MacArthur-winn... | 1116 | TED2006 | 1140912000 | 35 | Majora Carter | Majora Carter: Greening the ghetto | 1 | 1151367060 | [{'id': 3, 'name': 'Courageous', 'count': 760}... | [{'id': 1041, 'hero': 'https://pe.tedcdn.com/i... | Activist for environmental justice | ['MacArthur grant', 'activism', 'business', 'c... | Greening the ghetto | 1697550 |
| 4 | About 10 years ago, I took on the task to teac... | https://www.ted.com/talks/hans_rosling_shows_t... | 593 | You've never seen data presented like this. Wi... | 1190 | TED2006 | 1140566400 | 48 | Hans Rosling | Hans Rosling: The best stats you've ever seen | 1 | 1151440680 | [{'id': 9, 'name': 'Ingenious', 'count': 3202}... | [{'id': 2056, 'hero': 'https://pe.tedcdn.com/i... | Global health expert; data visionary | ['Africa', 'Asia', 'Google', 'demo', 'economic... | The best stats you've ever seen | 12005869 |
# Checkpoint the merged DataFrame so later steps can reload it without re-merging.
with open('ted_all.pkl', 'wb') as picklefile:
    pickle.dump(ted_all, picklefile)
# Expose the row position as an explicit 'id' column for later lookups.
ted_all['id'] = ted_all.index
# function to get video id and url from youtube from a given title of a ted talk
def getData(base_url, page_url, query):
    """Search the TED YouTube channel for `query` and return a video id.

    Scrapes the channel search page for 11-character ``watch?v=`` ids, then
    verifies the first hit against YouTube's oEmbed metadata: the id is only
    returned when `query` appears (case-insensitively) in the video title.

    Returns
    -------
    str or None
        The matching video id, or None when no result was found or the
        first hit's title does not contain the query.
    """
    with urlopen(page_url + urllib.parse.quote(query)) as page:
        video_ids = re.findall(r"watch\?v=(\S{11})", page.read().decode())
    # Fix: the original fell through to video_ids[0] on an empty result
    # (IndexError); bail out explicitly instead.
    if not video_ids:
        return None
    link = "/watch?v=" + video_ids[0]
    # Ask the oEmbed endpoint for the video's metadata (title in particular).
    params = {"format": "json", "url": base_url + link}
    url = "https://www.youtube.com/oembed?" + urllib.parse.urlencode(params)
    with urlopen(url) as response:
        data = json.loads(response.read().decode())
    # Fix: the original returned the tuple (None, None) on a successful title
    # match (so the caller's `!= None` check treated matches and misses alike)
    # and opened an extra unused connection to the watch page (leak).
    if query.lower() not in data['title'].lower():
        return None
    return video_ids[0]
base_url = "https://www.youtube.com"
page_url = 'https://www.youtube.com/c/TED/search?query='
# Look up a YouTube video id for every talk title, printing progress every
# 200 titles. Rows are accumulated in a plain list because DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0.
rows = []
for i, t in enumerate(ted_all['title']):
    if i % 200 == 0:
        print(i)
    vid = getData(base_url, page_url, t)
    if vid is not None:
        rows.append({'title': t, 'video_id': vid})
data = pd.DataFrame(rows, columns=['title', 'video_id'])
0 200 400 600 800 1000 1200 1400 1600 1800 2000 2200 2400
# Keep only talks for which a YouTube id was found, joining the id column
# back onto the full metadata, and checkpoint the result.
df_all = data.merge(ted_all, on='title')
with open('df_all.pkl', 'wb') as picklefile:
    pickle.dump(df_all, picklefile)
df_all
| title | video_id | transcript | url | comments | description | duration | event | film_date | languages | main_speaker | name | num_speaker | published_date | ratings | related_talks | speaker_occupation | tags | views | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Do schools kill creativity? | iG9CE55wbtY | Good morning. How are you?(Laughter)It's been ... | https://www.ted.com/talks/ken_robinson_says_sc... | 4553 | Sir Ken Robinson makes an entertaining and pro... | 1164 | TED2006 | 1140825600 | 60 | Ken Robinson | Ken Robinson: Do schools kill creativity? | 1 | 1151367060 | [{'id': 7, 'name': 'Funny', 'count': 19645}, {... | [{'id': 865, 'hero': 'https://pe.tedcdn.com/im... | Author/educator | ['children', 'creativity', 'culture', 'dance',... | 47227110 |
| 1 | Averting the climate crisis | rDiGYuQicpA | Thank you so much, Chris. And it's truly a gre... | https://www.ted.com/talks/al_gore_on_averting_... | 265 | With the same humor and humanity he exuded in ... | 977 | TED2006 | 1140825600 | 43 | Al Gore | Al Gore: Averting the climate crisis | 1 | 1151367060 | [{'id': 7, 'name': 'Funny', 'count': 544}, {'i... | [{'id': 243, 'hero': 'https://pe.tedcdn.com/im... | Climate advocate | ['alternative energy', 'cars', 'climate change... | 3200520 |
| 2 | Simplicity sells | NEjZt0y6OOw | (Music: "The Sound of Silence," Simon & Garfun... | https://www.ted.com/talks/david_pogue_says_sim... | 124 | New York Times columnist David Pogue takes aim... | 1286 | TED2006 | 1140739200 | 26 | David Pogue | David Pogue: Simplicity sells | 1 | 1151367060 | [{'id': 7, 'name': 'Funny', 'count': 964}, {'i... | [{'id': 1725, 'hero': 'https://pe.tedcdn.com/i... | Technology columnist | ['computers', 'entertainment', 'interface desi... | 1636292 |
| 3 | Greening the ghetto | gQ-cZRmHfs4 | If you're here today — and I'm very happy that... | https://www.ted.com/talks/majora_carter_s_tale... | 200 | In an emotionally charged talk, MacArthur-winn... | 1116 | TED2006 | 1140912000 | 35 | Majora Carter | Majora Carter: Greening the ghetto | 1 | 1151367060 | [{'id': 3, 'name': 'Courageous', 'count': 760}... | [{'id': 1041, 'hero': 'https://pe.tedcdn.com/i... | Activist for environmental justice | ['MacArthur grant', 'activism', 'business', 'c... | 1697550 |
| 4 | The best stats you've ever seen | hVimVzgtD6w | About 10 years ago, I took on the task to teac... | https://www.ted.com/talks/hans_rosling_shows_t... | 593 | You've never seen data presented like this. Wi... | 1190 | TED2006 | 1140566400 | 48 | Hans Rosling | Hans Rosling: The best stats you've ever seen | 1 | 1151440680 | [{'id': 9, 'name': 'Ingenious', 'count': 3202}... | [{'id': 2056, 'hero': 'https://pe.tedcdn.com/i... | Global health expert; data visionary | ['Africa', 'Asia', 'Google', 'demo', 'economic... | 12005869 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1977 | What we're missing in the debate about immigra... | 2wu28tI9VkM | So, Ma was trying to explain something to me a... | https://www.ted.com/talks/duarte_geraldino_wha... | 17 | Between 2008 and 2016, the United States depor... | 476 | TED2017 | 1496707200 | 4 | Duarte Geraldino | Duarte Geraldino: What we're missing in the de... | 1 | 1505851216 | [{'id': 3, 'name': 'Courageous', 'count': 24},... | [{'id': 2596, 'hero': 'https://pe.tedcdn.com/i... | Journalist | ['TED Residency', 'United States', 'community'... | 450430 |
| 1978 | The most Martian place on Earth | lWnr-99DNeE | This is a picture of a sunset on Mars taken by... | https://www.ted.com/talks/armando_azua_bustos_... | 6 | How can you study Mars without a spaceship? He... | 290 | TED2017 | 1492992000 | 3 | Armando Azua-Bustos | Armando Azua-Bustos: The most Martian place on... | 1 | 1505919737 | [{'id': 22, 'name': 'Fascinating', 'count': 32... | [{'id': 2491, 'hero': 'https://pe.tedcdn.com/i... | Astrobiologist | ['Mars', 'South America', 'TED Fellows', 'astr... | 417470 |
| 1979 | What intelligent machines can learn from a sch... | 0bRocfcPhHU | In my early days as a graduate student, I went... | https://www.ted.com/talks/radhika_nagpal_what_... | 10 | Science fiction visions of the future show us ... | 651 | TED2017 | 1492992000 | 1 | Radhika Nagpal | Radhika Nagpal: What intelligent machines can ... | 1 | 1506006095 | [{'id': 1, 'name': 'Beautiful', 'count': 14}, ... | [{'id': 2346, 'hero': 'https://pe.tedcdn.com/i... | Robotics engineer | ['AI', 'ants', 'fish', 'future', 'innovation',... | 375647 |
| 1980 | A black man goes undercover in the alt-right | OqUaEJLfrLo | I took a cell phone and accidentally made myse... | https://www.ted.com/talks/theo_e_j_wilson_a_bl... | 32 | In an unmissable talk about race and politics ... | 1100 | TEDxMileHigh | 1499472000 | 1 | Theo E.J. Wilson | Theo E.J. Wilson: A black man goes undercover ... | 1 | 1506024042 | [{'id': 11, 'name': 'Longwinded', 'count': 3},... | [{'id': 2512, 'hero': 'https://pe.tedcdn.com/i... | Public intellectual | ['Internet', 'TEDx', 'United States', 'communi... | 419309 |
| 1981 | How a video game might help us build better ci... | qYUmI5kGsYk | We humans are becoming an urban species, so ci... | https://www.ted.com/talks/karoliina_korppoo_ho... | 8 | With more than half of the world population li... | 519 | TED2017 | 1492992000 | 1 | Karoliina Korppoo | Karoliina Korppoo: How a video game might help... | 1 | 1506092422 | [{'id': 21, 'name': 'Unconvincing', 'count': 2... | [{'id': 2682, 'hero': 'https://pe.tedcdn.com/i... | Game designer | ['cities', 'design', 'future', 'infrastructure... | 391721 |
1982 rows × 19 columns
While looking at initial n-grams, I noticed that we get a lot of "thank you applause". So, I started looking at all the non-word behavior that is transcribed (see below). Luckily, the transcribers put all of the speaker's parenthetical comments, and some titles of songs played, into brackets [ ], and all of the audience sounds, videos, music, etc. in parentheses.
So, it is safe to first go and take out everything that is in parentheses before we even tokenize so that we can just look at speech.
It would be interesting to collect these and keep a count in the main matrix, especially for things like 'laughter' or applause or multimedia (present/not present) in making recommendations or calculating the popularity of a talk.
(Applause)(Applause ends)(Pre-recorded applause)(Pre-recorded applause and cheering)(Audience cheers)(Laughter)(Shouting)(Mock sob)(Breathes in)(Baby cooing)(Video)(Singing)(Heroic music)(Loud music)(Music)(Music ends)(Plays notes)(Sighs)(Clears throat)(Whispering)
Four important steps for cleaning the text and getting it into a format that we can analyze:
def clean_text(text):
    """Normalize an iterable of transcripts into cleaned strings.

    For each document: strip parenthetical stage directions such as
    "(Laughter)" or "(Applause)", tokenize, drop stopwords/punctuation
    (checked both before and after lemmatizing, since a lemma such as
    "wa" from "was" can itself be a stopword), and lowercase.

    Returns a list of space-joined cleaned documents, one per input.
    """
    lemmizer = WordNetLemmatizer()
    stop = stopwords.words('english')
    stop += ['.', ',',':','...','!"','?"', "'", '"',' - ',' — ',',"','."','!', ';','♫♫','♫',\
             '.\'"','[',']','—',".\'", 'ok','okay','yeah','ya','stuff', ' 000 ',' em ',\
             ' oh ','thank','thanks','la','was','wa','?','like','go',' le ',' ca ',' I '," ? ","s", " t ","ve","re"]
    # A set makes the per-token membership test O(1) instead of scanning
    # a ~200-entry list twice for every token.
    stop = set(stop)
    cleaned_text = []
    for post in text:
        cleaned_words = []
        # Remove "(...)" spans (audience sounds, music cues) before tokenizing.
        clean_parens = re.sub(r'\([^)]*\)', ' ', post)
        for word in wordpunct_tokenize(clean_parens):
            if word.lower() not in stop:
                low_word = lemmizer.lemmatize(word)
                if low_word.lower() not in stop:
                    cleaned_words.append(low_word.lower())
        cleaned_text.append(' '.join(cleaned_words))
    return cleaned_text
# Replace raw transcripts with the cleaned versions and checkpoint to disk.
df_all['transcript'] = clean_text(df_all['transcript'])
with open('cleaned_talks.pkl', 'wb') as picklefile:
    pickle.dump(df_all, picklefile)
# Peek at the first 300 characters of the first cleaned transcript.
df_all['transcript'][0][:300]
'good morning great blown away whole thing fact leaving three theme running conference relevant want talk one extraordinary evidence human creativity presentation people variety range second put u place idea going happen term future idea may play interest education actually find everybody interest ed'
Vectorization is the important step of turning our words into numbers. The vectorizer takes each word in each document and counts the number of times it appears. You end up with each word as a column and each row as a document (talk), so each entry is the frequency of that word in that document — we call this a sparse matrix.
# Count-based bag-of-ngrams features: uni- to tri-grams, capped at 10k terms,
# ignoring terms that appear in more than 60% of talks.
c_vectorizer = CountVectorizer(ngram_range=(1, 3),
                               stop_words='english',
                               max_df=0.6,
                               max_features=10000)
# TF-IDF weighted features over the same ngram range; tokens are runs of
# two or more lowercase letters.
t_vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                               stop_words='english',
                               token_pattern=r"\b[a-z][a-z]+\b",
                               lowercase=True,
                               max_df=0.6)
# fit_transform is equivalent to fit followed by transform on the same data.
c_x = c_vectorizer.fit_transform(df_all['transcript'])
t_x = t_vectorizer.fit_transform(df_all['transcript'])
Vectorization will come in use with Topic Modelling.
# Reload the cleaned-talks checkpoint saved earlier in the pipeline.
with open('cleaned_talks.pkl', 'rb') as picklefile:
    df = pickle.load(picklefile)
# Sanity check: number of talks loaded.
len(df)
1982
from sklearn.feature_extraction.text import TfidfVectorizer
# Unigram tf-idf restricted to mid-frequency vocabulary: a term must occur
# in at least 5% of talks (min_df) but no more than 30% (max_df), keeping
# words that are shared but not ubiquitous.
vectorizer = TfidfVectorizer(stop_words="english",
                             use_idf=True,
                             ngram_range=(1,1),
                             min_df = 0.05,
                             max_df = 0.3)
tfidf = vectorizer.fit_transform(df['transcript'])
import seaborn as sns
import matplotlib.pyplot as plt
def rank_words(terms, feature_matrix):
    """Rank vocabulary terms by total weight across all documents.

    Sums `feature_matrix` over the document axis and pairs each column's
    total with the corresponding entry of `terms`. Returns a DataFrame
    with columns ['term', 'rank'] sorted by descending rank.
    """
    totals = feature_matrix.sum(axis=0)
    rows = [(term, totals[0, idx]) for idx, term in enumerate(terms)]
    ranked = pd.DataFrame(rows, columns=['term', 'rank'])
    return ranked.sort_values('rank', ascending=False)
# Rank terms by aggregate tf-idf and plot the top 20.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() returns the same vocabulary.
ranked = rank_words(terms=vectorizer.get_feature_names_out(), feature_matrix=tfidf)
fig, ax = plt.subplots(figsize=(6, 10), ncols=1, nrows=1)
sns.barplot(x='rank', y='term', data=ranked[:20], palette='Blues_r', ax=ax)
<AxesSubplot:xlabel='rank', ylabel='term'>
# Let's visualize a word cloud with the frequencies obtained by idf transformation
# Map each uppercased term to its aggregate tf-idf weight.
dic = {term.upper(): weight for term, weight in zip(ranked['term'], ranked['rank'])}
wordcloud = WordCloud(background_color='white',
                      max_words=100,
                      colormap='Reds').generate_from_frequencies(dic)
fig = plt.figure(1, figsize=(12, 15))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
def sentimental(sent):
    """Classify `sent` with VADER's compound score.

    Returns 'pos' for compound >= 0.05, 'neg' for compound <= -0.05,
    and 'neu' otherwise.
    """
    analyzer = SentimentIntensityAnalyzer()
    compound = analyzer.polarity_scores(sent)['compound']
    if compound <= -0.05:
        return "neg"
    if compound >= 0.05:
        return "pos"
    return "neu"
# Label every talk with its overall transcript sentiment.
# Fixes two defects in the original: a stray backtick after `else:` (a
# SyntaxError), and re-running the VADER analyzer up to twice per row —
# classify once and map the short label to the column value.
label_map = {'pos': 'positive', 'neg': 'negative', 'neu': 'neutral'}
for i, sent in enumerate(df['transcript']):
    df.at[i, 'sentiment'] = label_map[sentimental(sent)]
df.head()
| title | video_id | transcript | url | comments | description | duration | event | film_date | languages | main_speaker | name | num_speaker | published_date | ratings | related_talks | speaker_occupation | tags | views | sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Do schools kill creativity? | iG9CE55wbtY | good morning great blown away whole thing fact... | https://www.ted.com/talks/ken_robinson_says_sc... | 4553 | Sir Ken Robinson makes an entertaining and pro... | 1164 | TED2006 | 1140825600 | 60 | Ken Robinson | Ken Robinson: Do schools kill creativity? | 1 | 1151367060 | [{'id': 7, 'name': 'Funny', 'count': 19645}, {... | [{'id': 865, 'hero': 'https://pe.tedcdn.com/im... | Author/educator | ['children', 'creativity', 'culture', 'dance',... | 47227110 | positive |
| 1 | Averting the climate crisis | rDiGYuQicpA | much chris truly great honor opportunity come ... | https://www.ted.com/talks/al_gore_on_averting_... | 265 | With the same humor and humanity he exuded in ... | 977 | TED2006 | 1140825600 | 43 | Al Gore | Al Gore: Averting the climate crisis | 1 | 1151367060 | [{'id': 7, 'name': 'Funny', 'count': 544}, {'i... | [{'id': 243, 'hero': 'https://pe.tedcdn.com/im... | Climate advocate | ['alternative energy', 'cars', 'climate change... | 3200520 | positive |
| 2 | Simplicity sells | NEjZt0y6OOw | hello voice mail old friend called tech suppor... | https://www.ted.com/talks/david_pogue_says_sim... | 124 | New York Times columnist David Pogue takes aim... | 1286 | TED2006 | 1140739200 | 26 | David Pogue | David Pogue: Simplicity sells | 1 | 1151367060 | [{'id': 7, 'name': 'Funny', 'count': 964}, {'i... | [{'id': 1725, 'hero': 'https://pe.tedcdn.com/i... | Technology columnist | ['computers', 'entertainment', 'interface desi... | 1636292 | positive |
| 3 | Greening the ghetto | gQ-cZRmHfs4 | today happy heard sustainable development save... | https://www.ted.com/talks/majora_carter_s_tale... | 200 | In an emotionally charged talk, MacArthur-winn... | 1116 | TED2006 | 1140912000 | 35 | Majora Carter | Majora Carter: Greening the ghetto | 1 | 1151367060 | [{'id': 3, 'name': 'Courageous', 'count': 760}... | [{'id': 1041, 'hero': 'https://pe.tedcdn.com/i... | Activist for environmental justice | ['MacArthur grant', 'activism', 'business', 'c... | 1697550 | positive |
| 4 | The best stats you've ever seen | hVimVzgtD6w | 10 year ago took task teach global development... | https://www.ted.com/talks/hans_rosling_shows_t... | 593 | You've never seen data presented like this. Wi... | 1190 | TED2006 | 1140566400 | 48 | Hans Rosling | Hans Rosling: The best stats you've ever seen | 1 | 1151440680 | [{'id': 9, 'name': 'Ingenious', 'count': 3202}... | [{'id': 2056, 'hero': 'https://pe.tedcdn.com/i... | Global health expert; data visionary | ['Africa', 'Asia', 'Google', 'demo', 'economic... | 12005869 | positive |
# Interactive histogram of the sentiment labels assigned above.
df['sentiment'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution')
# Distribution of per-talk comment counts.
df['comments'].iplot(
    kind='hist',
    bins=50,
    xTitle='comments',
    linecolor='black',
    yTitle='count',
    title='No. of Comments')
# Distribution of per-talk view counts.
df['views'].iplot(
    kind='hist',
    bins=50,
    xTitle='views',
    linecolor='black',
    yTitle='count',
    title='No. of Views')
# Collect every rating expression and its count across all talks (the
# 'ratings' column stores a stringified list of dicts) and show the
# overall distribution as a pie chart.
expressions, counts = [], []
for ratings_str in df['ratings']:
    for rating in ast.literal_eval(ratings_str):
        expressions.append(rating['name'])
        counts.append(rating['count'])
dic = {'Expression': expressions, 'Count': counts}
fig = px.pie(dic, names='Expression', values='Count')
fig.show()
# Frequency of every tag across all talks (the 'tags' column stores a
# stringified list). Counter replaces the manual if/else accumulation and,
# like dict, preserves first-seen insertion order, so the bar order is
# unchanged.
tag_counts = Counter()
for tags_str in df['tags']:
    tag_counts.update(ast.literal_eval(tags_str))
dic = {'Tags': list(tag_counts.keys()), 'Count': list(tag_counts.values())}
fig = px.bar(dic, x='Tags', y='Count')
fig.show()
def senti(sent):
    """Map `sent` to a sentiment sign via VADER's compound score.

    Returns 1 for compound >= 0.05, -1 for compound <= -0.05, 0 otherwise.
    """
    compound = SentimentIntensityAnalyzer().polarity_scores(sent)['compound']
    if compound >= 0.05:
        return 1
    return -1 if compound <= -0.05 else 0
# Score each unique rating expression (e.g. "Funny", "Inspiring") with
# VADER and plot one bar per expression.
names = []
for ratings_str in df['ratings']:
    names.extend(r['name'] for r in ast.literal_eval(ratings_str))
unique_names = np.unique(names)
dic = {'Expression': unique_names,
       'Sentiment': [senti(name) for name in unique_names]}
fig = px.bar(dic, x='Expression', y='Sentiment')
fig.show()
# Score each unique tag with VADER and plot one bar per tag.
all_tags = []
for tags_str in df['tags']:
    all_tags.extend(ast.literal_eval(tags_str))
unique_tags = np.unique(all_tags)
dic = {'Tags': unique_tags,
       'Sentiment': [senti(tag) for tag in unique_tags]}
fig = px.bar(dic, x='Tags', y='Sentiment')
fig.show()